NBIS ID: SMS-6112
Report Version: 1.0
Request by: Mattias Westman ()
Principal Investigator: Carl-Johan Sundberg ()
Organisation: KI
NBIS Staff: Juliana Assis ()


1 Setup

source('https://raw.githubusercontent.com/nimarafati/RNA_Seq_Pipeline/master/RNA_Seq/DE_script.R')
## Paths
path <- '/Users/juliana/Documents/GitHub/SMS-6112-22-MB/'

## LIBRARIES
# data handling
library(dplyr)
library(tidyr)
library(stringr)

# tables
library(kableExtra) # complete table
library(formattable) # table with conditional formatting
library(DT) # interactive table

# graphics
library(ggplot2) # static graphics

# interactive graphics
library(highcharter)
library(plotly)
library(ggiraph) # convert ggplot to interactive
library(dygraphs) # time series
library(networkD3) # network graph
library(leaflet) # interactive maps
library(crosstalk) # linking plots

# analysis
library("devtools")
library("bsselectR")
library("dplyr")
library("ggplot2")
library("tidyr")
library("DECIPHER")
library("gplots")
library("gridExtra")
library("grid")
library("ggpubr")
library("reshape2")
library("reshape")
library("ggrepel")
library("ggh4x")
library("RColorBrewer")
library("tidyverse")
library("DESeq2") # Bioconductor
library('ggfortify')
library("AnnotationDbi")
library("FactoMineR")
library("factoextra")
library("PCAtools")
library("Rqc")
library("gt")
library("clusterProfiler")
library("rnaseqGene")
#library("refGenome")
library("plotly")
library("biomaRt")
library("ggridges")
library("viridis")
library("hrbrthemes")

## Functions
#Perform DE analysis
perform_DE <- function(dds, contrast_name, log2FC_cutoff, padjust_cutoff, file_name, volcano, genes, file_path){
  res_obj <- results(dds, contrast = contrast_name, lfcThreshold = log2FC_cutoff, alpha = padjust_cutoff)
  res <- res_obj
  res$gene_id <- rownames(res_obj)
  res <- inner_join(x = as.data.frame(res), y = genes, by = 'gene_id')  
  res <- res[!is.na(res$padj),]
  res$diffexpressed <- 'No' 
  res$diffexpressed[(res$log2FoldChange >= log2FC_cutoff & res$padj <= 0.05)] <- 'Up-regulated'
  res$diffexpressed[(res$log2FoldChange <= -log2FC_cutoff & res$padj <= 0.05)] <- 'Down-regulated'
  res$delabel <- 'NA'
  res[(res$diffexpressed != 'No'), 'delabel'] <- res[(res$diffexpressed != 'No'),'gene_name']
  # Volcano
  if(volcano == T){
    png(paste0(file_path, file_name, '_volcano.png'), width = 3000, height = 2000, res = 300)
    p <- ggplot(data= as.data.frame(res), aes(x = log2FoldChange, y = -log10(padj), col = diffexpressed, label = delabel)) + 
    geom_point() + 
    theme_classic() +
    scale_colour_manual(values = c('blue', 'black', 'red')) +
    geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff), col = 'red') +
    geom_hline(yintercept = -log10(padjust_cutoff), col = 'red') +
    ggtitle(file_name) +
    theme(plot.title = element_text(hjust = 0.5))
    print(p)
    dev.off()

    # ntd <- normTransform(dds)
    # select <- res[(res$diffexpressed == 'Down-regulated'), 'gene_id']
    # df <- as.data.frame(colData(dds)[,c('Cell_group', 'Abeta', 'Sex')]) 
    # pheatmap(assay(ntd)[select,], 
    #          cluster_rows = F,
    #          cluster_cols = T,
    #          show_rownames = F,
    #          annotation_col = df)
  }
  write.table(res, paste0(file_path, 'DE_results.txt'), col.names = T, row.names = F, sep = '\t', quote = F)
  res_results <- list(DE_table = res,
                      Volcanot_plot = p,
                      res_obj = res_obj)
  saveRDS(res_results, paste0(file_path, 'res_results.rds'))
  return(res_results)
}

# Plot PC3, PC4
plotPCA_PCs <- function (object, ...) 
{
    .local <- function (object, intgroup = "condition", ntop = 500, 
        returnData = FALSE) 
    {
        rv <- rowVars(assay(object))
        select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, 
            length(rv)))]
        pca <- prcomp(t(assay(object)[select, ]))
        percentVar <- pca$sdev^2/sum(pca$sdev^2)
        if (!all(intgroup %in% names(colData(object)))) {
            stop("the argument 'intgroup' should specify columns of colData(dds)")
        }
        intgroup.df <- as.data.frame(colData(object)[, intgroup, 
            drop = FALSE])
        group <- if (length(intgroup) > 1) {
            factor(apply(intgroup.df, 1, paste, collapse = ":"))
        }
        else {
            colData(object)[[intgroup]]
        }
        d <- data.frame(PC1 = pca$x[, PC1], PC2 = pca$x[, PC2], group = group, 
            intgroup.df, name = colnames(object))
        if (returnData) {
            attr(d, "percentVar") <- percentVar[1:length(percentVar)]
            return(d)
        }
            geom_point(size = 3) + xlab(paste0(PC1, ": ", round(percentVar[PC1] * 
            100), "% variance")) + ylab(paste0(PC2, ": ", round(percentVar[PC2] * 
            100), "% variance")) + coord_fixed()
    }
    .local(object, ...)
}


ridgeline_plot <- function(ego_result, plot_name, fc_sorted){
  t <- ego_result
  t <- t %>% mutate(geneID = strsplit(as.character(geneID), '/')) %>% unnest(geneID) %>% filter(geneID != '') %>% dplyr::select(geneID, c('ONTOLOGY', 'Description', 'geneID'))
  t <- data.frame(t, logFC = fc_sorted[t$geneID])
  
  for(ont in unique(t$ONTOLOGY)){
    png(paste0(plot_name, '_', ont, '.png'), res = 200, height = 2000, width = 2000)
    p <- ggplot(t[(t$ONTOLOGY == ont),], aes(x = logFC, y = Description, fill = ..x..)) +
    geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
    scale_fill_viridis(name = "logFC", option = "C") +
    labs(title = paste0('logFC distribution of genes in enriched GO terms (', ont, ')')) +
    theme_ipsum() +
      theme(
        legend.position="none",
        panel.spacing = unit(0.1, "lines"),
        strip.text.x = element_text(size = 8)
      )
    print(p)
    dev.off()
  }
}


## Enrichment analysis
### GO  
run_ego <- function(DEG_entrez_id, OrgDb, ont, all_entrez_id, min_gene_Size, max_gene_Size, path_to_write) {
    setwd(path_to_write)
    ego <- clusterProfiler::enrichGO(DEG$entrezgene_id, 
                                     OrgDb = org_db, 
                                     ont = ontology_group, 
                                     universe = all_entrez_id, 
                                     minGSSize = min_gene_Size, 
                                     maxGSSize = max_gene_Size)

      if(nrow(ego) > 0 && !is.null(ego)){
        write.table(ego , 'enrichGO_results.txt', col.names = T, row.names = F, quote = F, sep = '\t')
        p_ego<- clusterProfiler::dotplot(ego, split="ONTOLOGY", font.size = 8, showCategory = nrow(ego), title = 'GO enrichment analysis') + facet_grid(ONTOLOGY~., scale="free") + theme(plot.title = element_text(hjust = 0.5))
        ## ridgeline plot
        ridgeline_plot(ego_result = ego@result, plot_name = 'ridgeline_FDR_0.05', fc_sorted = fc_sorted)
      }else{
        write.table('No GO enrichment result' , 'enrichGO_results.txt', col.names = T, row.names = F, quote = F, sep = '\t')
        p_ego <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
      }
      print(p_ego)
      dev.off()
    return(ego)  
  }
#### GSEGO
run_gsego <- function (fc_sorted, OrgDb, ontology_group, keyType, min_gene_Size, max_gene_Size, pvalueCutoff,  path_to_write){
    setwd(path_to_write)
    gsego <- clusterProfiler::gseGO(geneList=fc_sorted, 
             ont = ontology_group, 
             keyType = keyType, 
             minGSSize = min_gene_Size, 
             maxGSSize = max_gene_Size, 
             pvalueCutoff = pvalueCutoff, 
             pAdjustMethod = 'BH',
             verbose = TRUE, 
             OrgDb = org_db)
    
    if(!is.null(nrow(gsego)) && nrow(gsego) > 0 ){
        write.table(gsego , 'GSEA_GO_FDR_0.05.txt', col.names = T, row.names = F, quote = F, sep = '\t')
        
        ## dotplot
        png('dotplot_GSEA_GO_FDR_0.05.png', res = 200, height = 2000, width = 2000)
        p_gsego_dotplot <- clusterProfiler::dotplot(gsego, showCategory=nrow(gsego), split=".sign", font.size = 8, title = 'Gene set enrichment analysis of GO terms' ) + facet_grid(.~.sign) + theme(plot.title = element_text(hjust = 0.5))
        print(p_gsego_dotplot)
        dev.off()
        
        ## Gene set enrichment plot for each significant term
        cntr <- 1
        for(go_id in as.character(gsego@result$ID)){
          cat('\r', paste0('visualising ',go_id,' GO'))
          Description <- gsub( x = gsub(pattern = '/', '_', perl = T,gsego$Description[cntr]), replacement = "_", pattern = ' ')
          png(paste0('GSEA_GO_FDR_0.05_', Description, '.png'), res = 200, height = 2000, width = 2000)
          p_gsego_gse <- clusterProfiler::gseaplot(gsego, by = ontology_group, title = gsego$Description[cntr], geneSetID = cntr)
          print(p_gsego_gse)
          dev.off()
          cntr <- cntr + 1
        }
      }else{
        write.table('No GSEA GO result', 'GSEA_GO_FDR_0.05.txt', col.names = F, row.names = F, quote = F, sep = '\t')
        
        png('dotplot_GSEA_GO_FDR_0.05.png', res = 200, height = 2000, width = 2000)
        p_gsego_dotplot <-  ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
        print(p_gsego_dotplot)
        dev.off()
      }
    return(gsego)
  }

### KEGG  
run_ekegg <- function (fc_sorted, organism, keyType, all_entrez_id, minGSSize, pvalueCutoff, path_to_write){
  setwd(path_to_write)
    ekegg <- clusterProfiler::enrichKEGG(names(fc_sorted), organism = organism, keyType = keyType, pvalueCutoff = pvalueCutoff, pAdjustMethod = "BH", universe = as.character(all_entrez_id), maxGSSize = 1000, minGSSize = 10)
    ekegg@result$ratio <- sapply(strsplit(ekegg@result$GeneRatio, "/"), function(x) as.numeric(x[1])/as.numeric(x[2]))
  if(!is.null(nrow(ekegg)) && nrow(ekegg) > 0 ){
    write.table(ekegg , 'enrichKEGG_results.txt', col.names = T, row.names = F, quote = F, sep = '\t')
    ekegg_FDR_0.05 <- ekegg
    ekegg_FDR_0.05@result <- ekegg@result[(ekegg@result$p.adjust <= 0.05), ]
    
    png('dotplot_KEGG_Enrichment_FDR_0.05.png', res = 200, height = 2000, width = 2000)
  
    p_ekegg <- clusterProfiler::dotplot(ekegg_FDR_0.05, font.size = 8, showCategory = nrow(ekegg_FDR_0.05), title = 'KEGG enrichment analysis') + theme(plot.title = element_text(hjust = 0.5)) 
    print(p_ekegg)
    dev.off()
  }else{
    write.table('No KEGG enrichment result' , 'enrichKEGG_results.txt', col.names = F, row.names = F, quote = F, sep = '\t')
    png('dotplot_KEGG_Enrichment_FDR_0.05.png', res = 200, height = 2000, width = 2000)
    p_gsego_dotplot <-  ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
    print(p_gsego_dotplot)
    dev.off()
    
    p_ekegg <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
    print(p_ekegg)
    dev.off()
    return(ekegg)
  }
}

#### GSEA_KEGG
run_gsekegg <- function(fc_sorted, organism, keyType, eps = 0, minGSSize, pvalueCutoff, path_to_write){
    setwd(path_to_write)
    gsekegg <- clusterProfiler::gseKEGG(fc_sorted, organism = org, keyType = gene_type_conversion, pvalueCutoff = pvalueCutoff, eps = eps, minGSSize = min_gene_Size, maxGSSize = max_gene_Size, pAdjustMethod = "BH", nPermSimple = 10000)
   if( !is.null(nrow(gsekegg)) && nrow(gsekegg) > 0){
    gsekegg_FDR_0.05 <- gsekegg
    gsekegg_FDR_0.05@result <- gsekegg@result[(gsekegg@result$p.adjust <= 0.05), ]
    write.table(gsekegg_FDR_0.05 , 'GSEA_KEGG_FDR_0.05.txt', col.names = T, row.names = F, quote = F, sep = '\t')
    png('dotplot_GSEA_KEGG_FDR_0.05.png', res = 200, height = 2000, width = 2000)
    p_gsekegg_dotplot <- clusterProfiler::dotplot(gsekegg_FDR_0.05, showCategory = 10, title = "Enriched Pathways" , split=".sign") + facet_grid(.~.sign) + theme(plot.title = element_text(hjust = 0.5))
    print(p_gsekegg_dotplot)
    dev.off()
  }else{
    write.table('NO GSEA KEGG result' , 'GSEA_KEGG_FDR_0.05.txt', col.names = F, row.names = F, quote = F, sep = '\t')
    png('dotplot_GSEA_KEGG_FDR_0.05.png', res = 200, height = 2000, width = 2000)
    p_gsekegg_dotplot <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
    print(p_gsekegg_dotplot)
    dev.off()
  }
}

#### pathview
run_pathview <- function(gsekegg_FDR_0.05, fc_sorted, org, path_to_write){
  setwd(path_to_write)
  if(!is.null(nrow(gsekegg_FDR_0.05)) && nrow(gsekegg_FDR_0.05) >0 ){
    cntr <- 1
    for(ptw in as.character(gsekegg_FDR_0.05@result$ID)){
      cat('\r', paste0('Saving ',ptw,' pathway'))
      Description <- gsub( x = gsub(pattern = '/', '_', perl = T,gsekegg_FDR_0.05$Description[cntr]), replacement = "_", pattern = ' ')
      pathview(gene.data=fc_sorted, pathway.id=ptw, species = org, same.layer = 2)
      png(paste0('GSEA_KEGG_FDR_0.05_', Description, '.png'), res = 200, height = 2000, width = 2000)
      p <- gseaplot(gsekegg_FDR_0.05, by = "all", title = gsekegg_FDR_0.05$Description[cntr], geneSetID = cntr)
      print(p)
      dev.off()
      cntr <- cntr + 1
    }
  }else{
    png(paste0('GSEA_KEGG_FDR_0.05.png'), res = 200, height = 2000, width = 2000)
    p <- ggplot() + theme_void() + ggtitle('No significant term/category/pathway')
    print(p)
    dev.off()
  }
}

2 Background

Myotis brandtii are known to have at least a ten-fold longer life expectancy compared to mice of similar body mass, and the highest relative longevity of any mammals. To this end we have collected fresh snap-frozen complete organ systems from these bats during both winter hibernation and summer active period. We thus want to compare gene expressions and control of gene expressions across organs and between these groups. The whole genome is sequenced and available, but the pipeline from sequences to count tables is not established.

The list of contrasts of interest is:

genes based on the row-wise variance (no statistical support)

-Variant between Summer vs Winter (Bat1 vs Bat2)
-Variant between Organs and S/W
> –Intestine-S & Intestine-W,
–Kidney-S & Kidney-W, –Skeletal_muscle-S & Skeletal_muscle-W

2.1 Data

There are 12 samples where 5 belongs to Library1, 5 to Library2 and 2 to Library3

  • Type of data
    bulk RNASeq

  • Data location

cd /proj/snic2022-22-672/private/SMS_6112_RNASeq_Bat_Library1_2_3/

  • Uppmax project ID

SNIC 2022/23-355 SNIC Small Storage
SNIC 2022/22-672 SNIC Small Compute

  • Reference data used

NCBI Myotis brandtii Annotation Release 101

Partial genome with low quality annotation

3 Tools

nfcore-rnaseq

Trinity not shown

3.1 Methods

Reads were aligned to to the Myotis brandtii Annotation Release 101 genome by using nf-core/rnaseq pipeline (3.9) in Reproducibility section you can find all the tools with version used in nf-core pipeline. For downstream analyses we used expression values generated by.

In following table you can find list of samples and metadata.

3.2 Loading annotation

library("GenomicFeatures")
library("ensembldb")
gtf2 <- rtracklayer::import("/Users/juliana/Documents/GitHub/SMS-6112-22-MB/data/GCF_000412655.1_ASM41265v1_genomic.gtf")

TxDb <- makeTxDbFromGFF("/Users/juliana/Documents/GitHub/SMS-6112-22-MB/data/GCF_000412655.1_ASM41265v1_genomic.gtf")

# Convert to gene names
mart <- useMart("ENSEMBL_MART_ENSEMBL")
mart <- useDataset(mart=mart,dataset="mlucifugus_gene_ensembl")
myattributes <- c("ensembl_gene_id",
                  "entrezgene_id",
                  "external_gene_name",
                  "chromosome_name",
                  "start_position",
                  "end_position",
                  "strand",
                  "gene_biotype",
                  "description")
bdata <- getBM(mart=mart,attributes=myattributes,uniqueRows=T,
               useCache=FALSE)
# remove duplicated gene_ids
annotation <- bdata[!duplicated(bdata$ensembl_gene_id),c('ensembl_gene_id', 'external_gene_name', 'entrezgene_id')]
colnames(annotation) <- c('gene_id', 'Name', 'entrezgene_id')
annotation[(annotation$Name == ''), 'Name'] <- 'Uncharacterised'
write.table(annotation, paste0('/Users/juliana/Documents/GitHub/SMS-6112-22-MB/results/annotation.txt'), col.names = T, row.names = F, sep = '\t')
#ensDbFromAH

3.3 Loading the data

Here we load normalized counts scaled for length generated by salmon (version) [@] in nf-core/rnaseq pipeline. We tested different model and dependent on the contrast we chose one of them for downstream analysis.

exp <- readRDS("/Users/juliana/Documents/GitHub/SMS-6112-22-MB/results/ALL_Libraries/R_Files/salmon.merged.gene_counts_length_scaled.rds")
exp2 <- exp

assay(exp, 1) <- round(data.matrix(assay(exp, 1)))
assay(exp, 2) <- NULL

# Add metadata
exp2$sample_id <- as.factor(samples_info$sample_id)
exp2$bat <- as.factor(samples_info$bat)
exp2$organ <- as.factor(samples_info$organ)
exp2$library <- as.factor(samples_info$library)
exp2$organ_bat <- as.factor(samples_info$organ_bat)

counts <- assay(exp2, "counts")
dds <- DESeqDataSetFromMatrix(countData = round(counts),
                              colData = samples_info,
                              design = ~ 0 + organ + bat)

3.4 Filtering

We filter out low-count genes before downstream analysis by using at least 20 reads in total (including all samples).

keep <- rowSums(counts(dds)) >= 20
dds <- dds[keep,]

3.5 PCA

This high-dimensional count data can - after normalization - be visualized in two dimensions in a Principal Component Analysis (PCA). Using PCA, we examine the structure of the data by projecting the samples on a two-dimensional graph using the two first principal components that explain the most variation between those samples. The plot can be used to examine the samples for outliers, sample swaps and other relationships. When normalization successfully removed technical artefacts, the relative distances should be biologically interpretable.

cols_organ <- c("Skeletal muscle" = "#264D59", "Kidney" = "#77A515", "Intestine" = "#8B4513")

vsd <- vst(dds, blind = F)
plotPCA(vsd, "library")
#assay(vsd) <- limma::removeBatchEffect(assay(vsd), vsd$library)
#plotPCA(vsd, "library")

#topVarGenes_p <- head( order( rowVars( assay(vsd) ), decreasing=TRUE ), 500 )

a <- plotPCA(vsd, intgroup = c('sample_id', 'bat', 'organ' ,'library'), returnData = TRUE)

# By organ

teste <- plotPCA(vsd, "bat", "organ")
#$y
#[1] "PC2: 22% variance"
#$x
#[1] "PC1: 51% variance"

png(paste0(path, '/results/ALL_Libraries/results/01_DE/PCA_bat_organ_L1_2_3.png'), width = 4000, height = 3000, res = 300)
#teste$labels get variance

ggplot(a, aes(x = PC1, y = PC2, color = organ, shape = bat)) + 
  geom_point(size = 4, alpha = 0.8) +
               geom_hline(yintercept = 0, linetype = 'dashed', alpha = 0.3) + 
               geom_vline(xintercept = 0, linetype = 'dashed', alpha = 0.3) +
               scale_shape_manual(values=c(15,17)) +
               theme_pubr(border = TRUE) +
               #coord_fixed(ratio = 1) +
               geom_text_repel(aes(label = library), nudge_x = 0.06, size = 5.0, segment.alpha = 0.8) +
               theme(axis.text=element_text(size=14), 
                     axis.text.x = element_text(size = 12, hjust = 0.5), 
                     axis.title.y = element_text(size = 18),
                     legend.text=element_text(size=14), 
                     legend.title=element_text(size=0),
                     legend.position="bottom",
                     axis.title.x = element_text(size = 18), 
                     strip.text.x = element_text(size = 20, face = "bold")) +
              labs(x="PC1: 51% variance", y = "PC2: 22% variance", element_text(face = "bold")) +
              scale_color_manual(values = cols_organ)
dev.off()

3.6 Gene Variation

#The rlog transform
#https://bioc.ism.ac.jp/packages/2.14/bioc/vignettes/DESeq2/inst/doc/beginner.pdf
#highly variable genes. These are the genes most variable across all samples regardless of which samples they are
#rowVars on the output of vst as this corrects for the biased mean/variance trend and puts data on the log scale

rld <- rlog(dds, blind=F )

# Heatmap of Euclidean sample distances after rlog transformation.
sampleDists <- dist( t( assay(rld) ) )
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( rld$sample_id, rld$organ, rld$bat , sep="-" )

png(paste0(path, '/results/ALL_Libraries/results/01_DE/HeatMap_rlog_L1_2_3.png'), width = 4000, height = 3000, res = 300)
colours = colorRampPalette(rev(brewer.pal(9, "Blues")) )(255)
d <- pheatmap(sampleDistMatrix, trace="none", col=colours)
d
dev.off()

#Gene clustering
topVarGenes <- head(order(rowVars(assay(rld)), decreasing=TRUE ), 100)

df <- as.data.frame(colData(dds)[,c("bat","organ", "library")])
df$bat = factor(df$bat, levels = c("Bat_1", "Bat_2"), labels = c("Bat_1", "Bat_2"))
metaR_bat <- c("#C0C0C0","#BC8F8F") 
names(metaR_bat) <- levels(df$bat)

df$organ = factor(df$organ, levels = c("Skeletal muscle", "Kidney","Intestine")) 
metaR_organ <- c("#264D59", "#77A515", "#8B4513") 
names(metaR_organ) <- levels(df$organ)

metaR_AnnColour <- list(
  bat = metaR_bat)

metaR_AnnColour2 <- list(
  organ = metaR_organ)

metaR_AnnColourx <- list(
  organ = metaR_organ,
  bat = metaR_bat)

# Check the output
metaR_AnnColourx

SampleOrder = order(df$organ, df$bat)
meta.factors <- df[1:2]

df[1:2]

#png(paste0(path, '/results/ALL_Libraries/results/01_DEHeatMap_topvar500_L1.png'), width = 4000, height = 15500, res = 300)
png(paste0(path, '/results/ALL_Libraries/results/01_DE/HeatMap_topvar100_L1_2_3.png'), width = 4000, height = 8000, res = 300)

colsHeat<- c("#F7F7F7", "#92C5DE", "#0571B0", "#F4A582", "#CA0020")
pheatmap2 <- pheatmap(assay(rld)[topVarGenes, SampleOrder],
         cluster_cols = FALSE,
         cluster_rows = TRUE,
         gaps_row = 5, 
         clustering_distance_rows = "euclidean",
         clustering_distance_cols  = "euclidean",
         annotation_colors = metaR_AnnColourx, annotation_col = meta.factors,  
         show_colnames = TRUE,
         #color = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255),
         color = colorRampPalette(c(colsHeat))(255),
         border_color = "#f8edeb",
         display_numbers = FALSE)
pheatmap2
dev.off()
dev.off()


##

topVarGenes1000 <- head(order(rowVars(assay(rld)), decreasing=TRUE ), 1000)
mat  <- assay(vsd)[ topVarGenes1000, ]
mat  <- mat - rowMeans(mat)
#anno <- as.data.frame(colData(vsd)[, c("bat","organ")])
#pheatmap(mat, annotation_col = anno)
write.table(mat,file='/Users/juliana/Documents/GitHub/SMS-6112-22-MB/results/ALL_Libraries/results/01_DE/topVarGenes1000_L1_2_3.tsv',quote=FALSE,sep="\t")

##

png(paste0(path, '/results/ALL_Libraries/results/01_DE/HeatMap_topvar100_clus_1_L2_3.png'), width = 5000, height = 8000, res = 300)

#png(paste0(path, '/results/ALL_Libraries/results/01_DEHeatMap_topvar500_clus_L1.png'), width = 5000, height = 15500, res = 300)
pheatmap3 <- pheatmap(assay(rld)[topVarGenes, SampleOrder],
         cluster_cols = TRUE,
         cluster_rows = TRUE,
         gaps_row = 5, 
        scale="row",
         clustering_distance_rows = "euclidean",
         clustering_distance_cols  = "euclidean",
         annotation_colors = metaR_AnnColourx, annotation_col = meta.factors,  
         show_colnames = TRUE,
         #color = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255),
         color = colorRampPalette(c(colsHeat))(255),
         border_color = "#f8edeb",
         display_numbers = FALSE)
pheatmap3
dev.off()

3.7 Distribution of expression

counts_norm <- reshape2::melt(counts(dds2, normalized = T), varnames = c('gene_id', 'sample_name'), value.name = 'counts')
counts_norm <- inner_join(x = counts_norm, y = as.data.frame(dds2@colData), by = 'sample_name')


png(paste0(path, '/results/ALL_Libraries/results/01_DE/distribution_L1_2_3.png'), width = 3000, height = 2500, res = 300)
distribution <- ggplot(data = counts_norm, aes(x = reorder(sample_name, bat, na.rm = T), y = log2(counts+1))) +  geom_boxplot(aes(fill = factor(counts_norm$organ))) + coord_flip() + xlab('sample_name') + scale_color_manual(values = cols_organ) + theme_pubr(border = TRUE)
distribution
dev.off()

4 Results

4.1 Filtering

We filter out low-count genes before downstream analysis by using at least 20 reads in total (including all samples) by which we removed 5932 genes and kept 23060 genes for downstream analysis.

The distribution of normalized expression in all samples are shown in figure 1.

4.2 Sample correlation

There is a good correlation among samples as shown in figure 2. You can see that organs formed separate cluster.

Heatmap of Euclidean sample distances after rlog transformation

4.3 PCA

In figures ?? and ?? each point corresponds to a sample plotted on PC1 and PC2. I can see strong separation by organ . Library batch effect has been removed.

PCA all

4.4 Highly variable genes

4.4.1 All Samples

4.4.1.1 These are the genes (top 100) most variable across all samples regardless of which samples they are.

TopVar genes results. Table below shows list of 100 genes

TopVar Clustered independent of the sample

4.5 Annotation

Table with the information of the top 1000 genes

Table with all genes can be found at the Github.

4.6 GO analysis

Here, you can find the the GO profile of the topVar 1000 genes

Dinamic Link

GO Terms

5 Further Work - Limitations

We can’t go further due to the limitation in the experimental design

6 References

  • Experimental design here

7 Deliverables

Files delivered to the user with descriptions.

rmarkdown::render("report.Rmd")

8 Reproducibility

## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] hrbrthemes_0.8.0            viridis_0.6.2              
##  [3] viridisLite_0.4.1           ggridges_0.5.4             
##  [5] biomaRt_2.52.0              rnaseqGene_1.20.0          
##  [7] fission_1.16.0              RUVSeq_1.30.0              
##  [9] EDASeq_2.30.0               sva_3.44.0                 
## [11] mgcv_1.8-41                 nlme_3.1-160               
## [13] Gviz_1.40.1                 ReportingTools_2.36.0      
## [15] org.Hs.eg.db_3.15.0         genefilter_1.78.0          
## [17] ggbeeswarm_0.6.0            glmpca_0.2.0               
## [19] PoiClaClu_1.0.2.1           pheatmap_1.0.12            
## [21] hexbin_1.28.2               vsn_3.64.0                 
## [23] apeglm_1.18.0               magrittr_2.0.3             
## [25] tximeta_1.14.1              airway_1.16.0              
## [27] BiocStyle_2.24.0            gt_0.8.0                   
## [29] Rqc_1.30.0                  ShortRead_1.54.0           
## [31] GenomicAlignments_1.32.1    Rsamtools_2.12.0           
## [33] BiocParallel_1.30.4         PCAtools_2.8.0             
## [35] factoextra_1.0.7.999        FactoMineR_2.6             
## [37] AnnotationDbi_1.58.0        ggfortify_0.4.15           
## [39] DESeq2_1.36.0               SummarizedExperiment_1.26.1
## [41] Biobase_2.56.0              MatrixGenerics_1.8.1       
## [43] matrixStats_0.63.0          GenomicRanges_1.48.0       
## [45] forcats_0.5.2               purrr_0.3.5                
## [47] readr_2.1.3                 tibble_3.1.8               
## [49] tidyverse_1.3.2             RColorBrewer_1.1-3         
## [51] ggh4x_0.2.3                 ggrepel_0.9.2              
## [53] reshape_0.8.9               reshape2_1.4.4             
## [55] ggpubr_0.5.0                gridExtra_2.3              
## [57] gplots_3.1.3                DECIPHER_2.24.0            
## [59] RSQLite_2.2.19              Biostrings_2.64.1          
## [61] GenomeInfoDb_1.32.4         XVector_0.36.0             
## [63] IRanges_2.30.1              S4Vectors_0.34.0           
## [65] BiocGenerics_0.42.0         bsselectR_0.1.0            
## [67] devtools_2.4.5              usethis_2.1.6              
## [69] crosstalk_1.2.0             leaflet_2.1.1              
## [71] networkD3_0.4               dygraphs_1.1.1.6           
## [73] ggiraph_0.8.4               plotly_4.10.1              
## [75] highcharter_0.9.4           ggplot2_3.4.0              
## [77] DT_0.26                     formattable_0.2.1          
## [79] kableExtra_1.3.4            stringr_1.4.1              
## [81] tidyr_1.2.1                 dplyr_1.0.10               
## [83] edgeR_3.38.4                limma_3.52.4               
## [85] clusterProfiler_4.4.4       captioner_2.2.3            
## [87] bookdown_0.30               knitr_1.41                 
## 
## loaded via a namespace (and not attached):
##   [1] graphlayouts_0.8.4            lattice_0.20-45              
##   [3] haven_2.5.1                   vctrs_0.5.1                  
##   [5] Category_2.62.0               RBGL_1.72.0                  
##   [7] blob_1.2.3                    survival_3.4-0               
##   [9] later_1.3.0                   R.utils_2.12.2               
##  [11] DBI_1.1.3                     rappdirs_0.3.3               
##  [13] dqrng_0.3.0                   jpeg_0.1-9                   
##  [15] zlibbioc_1.42.0               OrganismDbi_1.38.1           
##  [17] htmlwidgets_1.5.4             mvtnorm_1.1-3                
##  [19] leaps_3.1                     irlba_2.3.5.1                
##  [21] markdown_1.4                  tidygraph_1.2.2              
##  [23] Rcpp_1.0.9                    KernSmooth_2.23-20           
##  [25] promises_1.2.0.1              DelayedArray_0.22.0          
##  [27] pkgload_1.3.2                 graph_1.74.0                 
##  [29] Hmisc_4.7-2                   fs_1.5.2                     
##  [31] fastmatch_1.1-3               digest_0.6.30                
##  [33] png_0.1-7                     scatterpie_0.1.8             
##  [35] cowplot_1.1.1                 DOSE_3.22.1                  
##  [37] ggraph_2.1.0                  pkgconfig_2.0.3              
##  [39] GO.db_3.15.0                  DelayedMatrixStats_1.18.2    
##  [41] estimability_1.4.1            emdbook_1.3.12               
##  [43] beeswarm_0.4.0                xfun_0.35                    
##  [45] bslib_0.4.1                   zoo_1.8-11                   
##  [47] tidyselect_1.2.0              rtracklayer_1.56.1           
##  [49] pkgbuild_1.4.0                rlang_1.0.6                  
##  [51] jquerylib_0.1.4               glue_1.6.2                   
##  [53] gdtools_0.2.4                 ensembldb_2.20.2             
##  [55] modelr_0.1.10                 emmeans_1.8.2                
##  [57] multcompView_0.1-8            ggsignif_0.6.4               
##  [59] GGally_2.1.2                  httpuv_1.6.6                 
##  [61] Rttf2pt1_1.3.11               preprocessCore_1.58.0        
##  [63] TH.data_1.1-1                 DO.db_2.9                    
##  [65] annotate_1.74.0               webshot_0.5.4                
##  [67] jsonlite_1.8.3                bit_4.0.5                    
##  [69] mime_0.12                     systemfonts_1.0.4            
##  [71] stringi_1.7.8                 GenomicFiles_1.32.1          
##  [73] processx_3.8.0                yulab.utils_0.0.5            
##  [75] bitops_1.0-7                  cli_3.4.1                    
##  [77] data.table_1.14.6             timechange_0.1.1             
##  [79] rstudioapi_0.14               qvalue_2.28.0                
##  [81] locfit_1.5-9.6                VariantAnnotation_1.42.1     
##  [83] miniUI_0.1.1.1                gridGraphics_0.5-1           
##  [85] R.oo_1.25.0                   urlchecker_1.0.1             
##  [87] dbplyr_2.2.1                  sessioninfo_1.2.2            
##  [89] TTR_0.24.3                    readxl_1.4.1                 
##  [91] lifecycle_1.0.3               quantmod_0.4.20              
##  [93] munsell_0.5.0                 cellranger_1.1.0             
##  [95] R.methodsS3_1.8.2             hwriter_1.3.2.1              
##  [97] caTools_1.18.2                codetools_0.2-18             
##  [99] coda_0.19-4                   vipor_0.4.5                  
## [101] htmlTable_2.4.1               xtable_1.8-4                 
## [103] flashClust_1.01-2             googlesheets4_1.0.1          
## [105] BiocManager_1.30.19           scatterplot3d_0.3-42         
## [107] abind_1.4-5                   farver_2.1.1                 
## [109] AnnotationHub_3.4.0           aplot_0.1.9                  
## [111] biovizBase_1.44.0             ggtree_3.4.4                 
## [113] BiocIO_1.6.0                  ggbio_1.44.1                 
## [115] patchwork_1.1.2               profvis_0.3.7                
## [117] dichromat_2.0-0.1             cluster_2.1.4                
## [119] extrafontdb_1.0               Matrix_1.5-3                 
## [121] tidytree_0.4.1                ellipsis_0.3.2               
## [123] prettyunits_1.1.1             lubridate_1.9.0              
## [125] googledrive_2.0.0             reprex_2.0.2                 
## [127] igraph_1.3.5                  fgsea_1.22.0                 
## [129] remotes_2.4.2                 gargle_1.2.1                 
## [131] htmltools_0.5.3               BiocFileCache_2.4.0          
## [133] yaml_2.3.6                    GenomicFeatures_1.48.4       
## [135] utf8_1.2.2                    interactiveDisplayBase_1.34.0
## [137] XML_3.99-0.12                 foreign_0.8-83               
## [139] withr_2.5.0                   GOstats_2.62.0               
## [141] aroma.light_3.26.0            bit64_4.0.5                  
## [143] affyio_1.66.0                 multcomp_1.4-20              
## [145] ProtGenerics_1.28.0           GOSemSim_2.22.0              
## [147] rsvd_1.0.5                    ScaledMatrix_1.4.1           
## [149] memoise_2.0.1                 evaluate_0.18                
## [151] extrafont_0.18                geneplotter_1.74.0           
## [153] tzdb_0.3.0                    callr_3.7.3                  
## [155] ps_1.7.2                      curl_4.3.3                   
## [157] fansi_1.0.3                   highr_0.9                    
## [159] GSEABase_1.58.0               xts_0.12.2                   
## [161] checkmate_2.1.0               cachem_1.0.6                 
## [163] interp_1.1-3                  deldir_1.0-6                 
## [165] rjson_0.2.21                  rstatix_0.7.1                
## [167] rlist_0.4.6.2                 tools_4.2.0                  
## [169] sass_0.4.4                    sandwich_3.0-2               
## [171] RCurl_1.98-1.9                car_3.1-1                    
## [173] ape_5.6-2                     ggplotify_0.1.0              
## [175] xml2_1.3.3                    httr_1.4.4                   
## [177] assertthat_0.2.1              rmarkdown_2.18               
## [179] R6_2.5.1                      AnnotationFilter_1.20.0      
## [181] nnet_7.3-18                   progress_1.2.2               
## [183] tximport_1.24.0               KEGGREST_1.36.3              
## [185] treeio_1.20.2                 gtools_3.9.4                 
## [187] beachmat_2.12.0               BiocVersion_3.15.2           
## [189] BiocSingular_1.12.0           splines_4.2.0                
## [191] carData_3.0-5                 ggfun_0.0.9                  
## [193] colorspace_2.0-3              generics_0.1.3               
## [195] base64enc_0.1-3               pillar_1.8.1                 
## [197] Rgraphviz_2.40.0              tweenr_2.0.2                 
## [199] affy_1.74.0                   uuid_1.1-0                   
## [201] GenomeInfoDbData_1.2.8        plyr_1.8.8                   
## [203] gtable_0.3.1                  bdsmatrix_1.3-6              
## [205] rvest_1.0.3                   restfulr_0.0.15              
## [207] latticeExtra_0.6-30           shadowtext_0.1.2             
## [209] fastmap_1.1.0                 broom_1.0.1                  
## [211] BSgenome_1.64.0               scales_1.2.1                 
## [213] filelock_1.0.2                backports_1.4.1              
## [215] enrichplot_1.16.2             hms_1.1.2                    
## [217] ggforce_0.4.1                 shiny_1.7.3                  
## [219] polyclip_1.10-4               numDeriv_2016.8-1.1          
## [221] bbmle_1.0.25                  lazyeval_0.2.2               
## [223] Formula_1.2-4                 crayon_1.5.2                 
## [225] MASS_7.3-58.1                 downloader_0.4               
## [227] PFAM.db_3.15.0                sparseMatrixStats_1.8.0      
## [229] svglite_2.1.0                 AnnotationForge_1.38.1       
## [231] rpart_4.1.19                  compiler_4.2.0

9 Practical Info

9.1 Data responsibility

The responsibility for data archiving lies with the PI of the project. We do not offer long-term storage or retrieval of data.

  • NBIS & Uppnex: We kindly ask that you remove the files from UPPMAX/UPPNEX. The main storage at UPPNEX is optimized for high-speed and parallel access, which makes it expensive and not the right place for longer time archiving. Please consider others by not taking up the expensive space. Please note that UPPMAX is a resource separate from the Bioinformatics Platform, administered by the Swedish National Infrastructure for Computing (SNIC) and SNIC-specifc project rules apply to all projects hosted at UPPMAX.
  • Sensitive data : Please note that special considerations may apply to the human-derived legally considered sensitive personal data. These should be handled according to specific laws and regulations as outlined e.g. here.
  • Long-term backup : We recommend asking your local IT for support with long-term data archiving. Also a newly established Data Office at SciLifeLab may be of help to discuss other options.

9.2 Acknowledgments

If you are presenting the results in a paper, at a workshop or conference, we kindly ask you to acknowledge us.

“Support by NBIS (National Bioinformatics Infrastructure Sweden) is gratefully acknowledged.”

“The computations were performed on resources provided by SNIC through Uppsala Multidisciplinary Center for Advanced Computational Science (UPPMAX) under Project SNIC 2022-22-352.”

  • NGI : For publications based on data from NGI Sweden, NGI, SciLifeLab and UPPMAX should be acknowledged like so:

“The authors would like to acknowledge support from Science for Life Laboratory (SciLifeLab), the National Genomics Infrastructure (NGI), and Uppsala Multidisciplinary Center for Advanced Computational Science (UPPMAX) for providing assistance in massive parallel sequencing and computational infrastructure.”

10 Support Completion

You should soon be contacted by one of our managers with a request to close down the project in our internal system and for invoicing matters. If we do not hear from you within 30 days the project will be automatically closed and invoice sent. Again, we would like to remind you about data responsibility and acknowledgements, see sections: Data Responsibility and Acknowledgments.

You are welcome to come back to us with further data analysis request at any time via http://nbis.se/support/support.html.

Thank you for using NBIS.